{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "whjsJasuhstV"
   },
   "source": [
    "<a href=\"https://colab.research.google.com/github/jeffheaton/app_generative_ai/blob/main/t81_559_class_03_3_text_summary.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "euOZxlIMhstX"
   },
   "source": [
    "# T81-559: Applications of Generative Artificial Intelligence\n",
    "**Module 3: Large Language Models**\n",
    "* Instructor: [Jeff Heaton](https://sites.wustl.edu/jeffheaton/), McKelvey School of Engineering, [Washington University in St. Louis](https://engineering.wustl.edu/Programs/Pages/default.aspx)\n",
    "* For more information visit the [class website](https://sites.wustl.edu/jeffheaton/t81-558/)."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "d4Yov72PhstY"
   },
   "source": [
    "# Module 3 Material\n",
    "\n",
    "* Part 3.1: Foundation Models [[Video]](https://www.youtube.com/watch?v=Gb0tk5qq1fA) [[Notebook]](t81_559_class_03_1_llm.ipynb)\n",
    "* Part 3.2: Text Generation [[Video]](https://www.youtube.com/watch?v=lB97Lqt7q58) [[Notebook]](t81_559_class_03_2_text_gen.ipynb)\n",
    "* **Part 3.3: Text Summarization** [[Video]](https://www.youtube.com/watch?v=3MoIUXE2eEU) [[Notebook]](t81_559_class_03_3_text_summary.ipynb)\n",
    "* Part 3.4: Text Classification [[Video]](https://www.youtube.com/watch?v=2VpOwFIGmA8) [[Notebook]](t81_559_class_03_4_classification.ipynb)\n",
    "* Part 3.5: LLM Writes a Book [[Video]](https://www.youtube.com/watch?v=iU40Rttlb_Q) [[Notebook]](t81_559_class_03_5_book.ipynb)\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "AcAUP0c3hstY"
   },
   "source": [
    "# Google CoLab Instructions\n",
    "\n",
    "The following code ensures that Google CoLab is running and maps Google Drive if needed."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "xsI496h5hstZ",
    "outputId": "4012a651-b94b-448e-bcb5-fe7044f7e640"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Note: using Google CoLab\n",
      "Collecting langchain\n",
      "  Downloading langchain-0.1.16-py3-none-any.whl (817 kB)\n",
      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m817.7/817.7 kB\u001b[0m \u001b[31m4.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
      "\u001b[?25hCollecting langchain_openai\n",
      "  Downloading langchain_openai-0.1.4-py3-none-any.whl (33 kB)\n",
      "Collecting pypdf\n",
      "  Downloading pypdf-4.2.0-py3-none-any.whl (290 kB)\n",
      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m290.4/290.4 kB\u001b[0m \u001b[31m5.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
      "\u001b[?25hRequirement already satisfied: PyYAML>=5.3 in /usr/local/lib/python3.10/dist-packages (from langchain) (6.0.1)\n",
      "Requirement already satisfied: SQLAlchemy<3,>=1.4 in /usr/local/lib/python3.10/dist-packages (from langchain) (2.0.29)\n",
      "Requirement already satisfied: aiohttp<4.0.0,>=3.8.3 in /usr/local/lib/python3.10/dist-packages (from langchain) (3.9.5)\n",
      "Requirement already satisfied: async-timeout<5.0.0,>=4.0.0 in /usr/local/lib/python3.10/dist-packages (from langchain) (4.0.3)\n",
      "Collecting dataclasses-json<0.7,>=0.5.7 (from langchain)\n",
      "  Downloading dataclasses_json-0.6.4-py3-none-any.whl (28 kB)\n",
      "Collecting jsonpatch<2.0,>=1.33 (from langchain)\n",
      "  Downloading jsonpatch-1.33-py2.py3-none-any.whl (12 kB)\n",
      "Collecting langchain-community<0.1,>=0.0.32 (from langchain)\n",
      "  Downloading langchain_community-0.0.34-py3-none-any.whl (1.9 MB)\n",
      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.9/1.9 MB\u001b[0m \u001b[31m13.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
      "\u001b[?25hCollecting langchain-core<0.2.0,>=0.1.42 (from langchain)\n",
      "  Downloading langchain_core-0.1.46-py3-none-any.whl (299 kB)\n",
      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m299.3/299.3 kB\u001b[0m \u001b[31m3.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
      "\u001b[?25hCollecting langchain-text-splitters<0.1,>=0.0.1 (from langchain)\n",
      "  Downloading langchain_text_splitters-0.0.1-py3-none-any.whl (21 kB)\n",
      "Collecting langsmith<0.2.0,>=0.1.17 (from langchain)\n",
      "  Downloading langsmith-0.1.51-py3-none-any.whl (115 kB)\n",
      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m116.0/116.0 kB\u001b[0m \u001b[31m3.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
      "\u001b[?25hRequirement already satisfied: numpy<2,>=1 in /usr/local/lib/python3.10/dist-packages (from langchain) (1.25.2)\n",
      "Requirement already satisfied: pydantic<3,>=1 in /usr/local/lib/python3.10/dist-packages (from langchain) (2.7.0)\n",
      "Requirement already satisfied: requests<3,>=2 in /usr/local/lib/python3.10/dist-packages (from langchain) (2.31.0)\n",
      "Requirement already satisfied: tenacity<9.0.0,>=8.1.0 in /usr/local/lib/python3.10/dist-packages (from langchain) (8.2.3)\n",
      "Collecting openai<2.0.0,>=1.10.0 (from langchain_openai)\n",
      "  Downloading openai-1.23.6-py3-none-any.whl (311 kB)\n",
      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m311.6/311.6 kB\u001b[0m \u001b[31m14.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
      "\u001b[?25hCollecting tiktoken<1,>=0.5.2 (from langchain_openai)\n",
      "  Downloading tiktoken-0.6.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.8 MB)\n",
      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.8/1.8 MB\u001b[0m \u001b[31m31.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
      "\u001b[?25hRequirement already satisfied: typing_extensions>=4.0 in /usr/local/lib/python3.10/dist-packages (from pypdf) (4.11.0)\n",
      "Requirement already satisfied: aiosignal>=1.1.2 in /usr/local/lib/python3.10/dist-packages (from aiohttp<4.0.0,>=3.8.3->langchain) (1.3.1)\n",
      "Requirement already satisfied: attrs>=17.3.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp<4.0.0,>=3.8.3->langchain) (23.2.0)\n",
      "Requirement already satisfied: frozenlist>=1.1.1 in /usr/local/lib/python3.10/dist-packages (from aiohttp<4.0.0,>=3.8.3->langchain) (1.4.1)\n",
      "Requirement already satisfied: multidict<7.0,>=4.5 in /usr/local/lib/python3.10/dist-packages (from aiohttp<4.0.0,>=3.8.3->langchain) (6.0.5)\n",
      "Requirement already satisfied: yarl<2.0,>=1.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp<4.0.0,>=3.8.3->langchain) (1.9.4)\n",
      "Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7,>=0.5.7->langchain)\n",
      "  Downloading marshmallow-3.21.1-py3-none-any.whl (49 kB)\n",
      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m49.4/49.4 kB\u001b[0m \u001b[31m3.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
      "\u001b[?25hCollecting typing-inspect<1,>=0.4.0 (from dataclasses-json<0.7,>=0.5.7->langchain)\n",
      "  Downloading typing_inspect-0.9.0-py3-none-any.whl (8.8 kB)\n",
      "Collecting jsonpointer>=1.9 (from jsonpatch<2.0,>=1.33->langchain)\n",
      "  Downloading jsonpointer-2.4-py2.py3-none-any.whl (7.8 kB)\n",
      "Collecting packaging<24.0,>=23.2 (from langchain-core<0.2.0,>=0.1.42->langchain)\n",
      "  Downloading packaging-23.2-py3-none-any.whl (53 kB)\n",
      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m53.0/53.0 kB\u001b[0m \u001b[31m4.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
      "\u001b[?25hCollecting orjson<4.0.0,>=3.9.14 (from langsmith<0.2.0,>=0.1.17->langchain)\n",
      "  Downloading orjson-3.10.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (141 kB)\n",
      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m141.1/141.1 kB\u001b[0m \u001b[31m5.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
      "\u001b[?25hRequirement already satisfied: anyio<5,>=3.5.0 in /usr/local/lib/python3.10/dist-packages (from openai<2.0.0,>=1.10.0->langchain_openai) (3.7.1)\n",
      "Requirement already satisfied: distro<2,>=1.7.0 in /usr/lib/python3/dist-packages (from openai<2.0.0,>=1.10.0->langchain_openai) (1.7.0)\n",
      "Collecting httpx<1,>=0.23.0 (from openai<2.0.0,>=1.10.0->langchain_openai)\n",
      "  Downloading httpx-0.27.0-py3-none-any.whl (75 kB)\n",
      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m75.6/75.6 kB\u001b[0m \u001b[31m7.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
      "\u001b[?25hRequirement already satisfied: sniffio in /usr/local/lib/python3.10/dist-packages (from openai<2.0.0,>=1.10.0->langchain_openai) (1.3.1)\n",
      "Requirement already satisfied: tqdm>4 in /usr/local/lib/python3.10/dist-packages (from openai<2.0.0,>=1.10.0->langchain_openai) (4.66.2)\n",
      "Requirement already satisfied: annotated-types>=0.4.0 in /usr/local/lib/python3.10/dist-packages (from pydantic<3,>=1->langchain) (0.6.0)\n",
      "Requirement already satisfied: pydantic-core==2.18.1 in /usr/local/lib/python3.10/dist-packages (from pydantic<3,>=1->langchain) (2.18.1)\n",
      "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests<3,>=2->langchain) (3.3.2)\n",
      "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests<3,>=2->langchain) (3.7)\n",
      "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests<3,>=2->langchain) (2.0.7)\n",
      "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests<3,>=2->langchain) (2024.2.2)\n",
      "Requirement already satisfied: greenlet!=0.4.17 in /usr/local/lib/python3.10/dist-packages (from SQLAlchemy<3,>=1.4->langchain) (3.0.3)\n",
      "Requirement already satisfied: regex>=2022.1.18 in /usr/local/lib/python3.10/dist-packages (from tiktoken<1,>=0.5.2->langchain_openai) (2023.12.25)\n",
      "Requirement already satisfied: exceptiongroup in /usr/local/lib/python3.10/dist-packages (from anyio<5,>=3.5.0->openai<2.0.0,>=1.10.0->langchain_openai) (1.2.1)\n",
      "Collecting httpcore==1.* (from httpx<1,>=0.23.0->openai<2.0.0,>=1.10.0->langchain_openai)\n",
      "  Downloading httpcore-1.0.5-py3-none-any.whl (77 kB)\n",
      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m77.9/77.9 kB\u001b[0m \u001b[31m5.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
      "\u001b[?25hCollecting h11<0.15,>=0.13 (from httpcore==1.*->httpx<1,>=0.23.0->openai<2.0.0,>=1.10.0->langchain_openai)\n",
      "  Downloading h11-0.14.0-py3-none-any.whl (58 kB)\n",
      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m58.3/58.3 kB\u001b[0m \u001b[31m7.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
      "\u001b[?25hCollecting mypy-extensions>=0.3.0 (from typing-inspect<1,>=0.4.0->dataclasses-json<0.7,>=0.5.7->langchain)\n",
      "  Downloading mypy_extensions-1.0.0-py3-none-any.whl (4.7 kB)\n",
      "Installing collected packages: pypdf, packaging, orjson, mypy-extensions, jsonpointer, h11, typing-inspect, tiktoken, marshmallow, jsonpatch, httpcore, langsmith, httpx, dataclasses-json, openai, langchain-core, langchain-text-splitters, langchain_openai, langchain-community, langchain\n",
      "  Attempting uninstall: packaging\n",
      "    Found existing installation: packaging 24.0\n",
      "    Uninstalling packaging-24.0:\n",
      "      Successfully uninstalled packaging-24.0\n",
      "Successfully installed dataclasses-json-0.6.4 h11-0.14.0 httpcore-1.0.5 httpx-0.27.0 jsonpatch-1.33 jsonpointer-2.4 langchain-0.1.16 langchain-community-0.0.34 langchain-core-0.1.46 langchain-text-splitters-0.0.1 langchain_openai-0.1.4 langsmith-0.1.51 marshmallow-3.21.1 mypy-extensions-1.0.0 openai-1.23.6 orjson-3.10.1 packaging-23.2 pypdf-4.2.0 tiktoken-0.6.0 typing-inspect-0.9.0\n"
     ]
    }
   ],
   "source": [
    "import os\n",
    "\n",
    "try:\n",
    "    from google.colab import drive, userdata\n",
    "    COLAB = True\n",
    "    print(\"Note: using Google CoLab\")\n",
    "except:\n",
    "    print(\"Note: not using Google CoLab\")\n",
    "    COLAB = False\n",
    "\n",
    "# OpenAI Secrets\n",
    "if COLAB:\n",
    "    os.environ[\"OPENAI_API_KEY\"] = userdata.get('OPENAI_API_KEY')\n",
    "\n",
    "# Install needed libraries in CoLab\n",
    "if COLAB:\n",
    "    !pip install langchain langchain_openai pypdf"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "pC9A-LaYhsta"
   },
   "source": [
    "# 3.3: Text Summarization\n",
    "\n",
    "Large Language Models (LLMs) like GPT-4 can be utilized to summarize text by extracting key information and presenting it in a concise format. They work by understanding the context and semantic relationships within the original text and then generating a shorter version that retains the essential messages. This process involves natural language understanding and generation capabilities, allowing LLMs to interpret various types of texts, from technical articles to narratives, and produce summaries that are coherent and relevant. The ability to customize the length and focus of the summary based on user preferences makes LLMs particularly effective for digesting large volumes of information quickly and efficiently.\n",
    "\n",
    "## Summarize Single PDF\n",
    "\n",
    "We will begin by seeing how to summarize a single PDF. LangChang loads document types, such as PDFs, using a document loader. There are document loaders for various data types. The following code summarizes a single PDF using a generic summarization system prompt."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "id": "dfISNTpwOKiZ"
   },
   "outputs": [],
   "source": [
    "from langchain.chains.summarize import load_summarize_chain\n",
    "from langchain.document_loaders import PyPDFLoader, TextLoader\n",
    "from langchain import OpenAI, PromptTemplate\n",
    "from langchain_openai import ChatOpenAI\n",
    "from IPython.display import display_markdown\n",
    "\n",
    "MODEL = 'gpt-4o-mini'\n",
    "\n",
    "llm = ChatOpenAI(\n",
    "        model=MODEL,\n",
    "        temperature=0.2,\n",
    "        n=1\n",
    "    )\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "OLUWbW7NRp4m"
   },
   "source": [
    "\n",
    "The following code snippet demonstrates how to use a specific 'load_summarize_chain' function to set up a summarization process using a Large Language Model (LLM) with a \"map_reduce\" chain type. It starts by loading a PDF from the given URL (\"https://arxiv.org/pdf/1706.03762\") using the 'PyPDFLoader'. The loaded document is then split into manageable parts ('load_and_split'). These parts are fed into the summarization chain ('chain.run(docs)'), which processes and condenses the content. Finally, the summarized content is displayed in markdown format directly within the output environment, ensuring that the formatting of the summary remains intact."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 116
    },
    "id": "pDmTQPXgqQfd",
    "outputId": "37708678-596e-4ace-faf5-329f6a75d03b"
   },
   "outputs": [
    {
     "data": {
      "text/markdown": [
       "The paper \"Attention Is All You Need\" introduces the Transformer model, based on attention mechanisms, which outperforms existing models in machine translation tasks. The model relies on self-attention instead of recurrence, allowing for faster training and improved performance. The model architecture consists of stacked self-attention and fully connected layers in both the encoder and decoder. The study compares the efficiency of self-attention, recurrent, and convolutional layers in sequence transduction tasks, highlighting the advantages of self-attention. The Transformer model achieves better BLEU scores in translation tasks and demonstrates generalization to other tasks. Future research will explore applications beyond text and improve generation efficiency."
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "chain = load_summarize_chain(llm, chain_type=\"map_reduce\")\n",
    "\n",
    "url = \"https://arxiv.org/pdf/1706.03762\"\n",
    "loader = PyPDFLoader(url)\n",
    "docs = loader.load_and_split()\n",
    "summary = chain.invoke(docs)['output_text']\n",
    "display_markdown(summary, raw=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "uA1iXHlyMSUU"
   },
   "source": [
    "## Summarize with Custom Prompt\n",
    "\n",
    "LangChain also allows the use of custom system prompts to tailor text summarization according to specific requirements, such as summarizing content in a different language. This flexibility is showcased in the provided code, where a custom prompt template instructs the system to write a concise summary in Spanish. The template is set up to include a placeholder for the text that needs summarizing, followed by an instruction in Spanish to produce a summary. This custom prompt is then incorporated into the summarization process by configuring both the 'map_prompt' and 'combine_prompt' parameters of the 'load_summarize_chain' function. The process begins by downloading a PDF from a specified URL using 'PyPDFLoader', splitting the document into sections, and then applying the summarization chain with the custom prompt to generate a summarized output in Spanish. The summarized content is then displayed in markdown format to maintain proper formatting. This example illustrates the adaptability of LangChain in handling complex summarization tasks that include language-specific instructions."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 133
    },
    "id": "lpQmmSiNMZNY",
    "outputId": "6dbf3e1f-a34c-48cb-fc99-16560ddf9bba"
   },
   "outputs": [
    {
     "data": {
      "text/markdown": [
       "El artículo presenta el modelo Transformer, una arquitectura de red neuronal basada en mecanismos de atención sin recurrencia ni convoluciones. El Transformer ha demostrado ser superior en calidad, más paralelizable y más rápido de entrenar en tareas de traducción automática. Se han logrado resultados sobresalientes en tareas de traducción de inglés a alemán y francés, así como en análisis de constituyentes en inglés. Se utilizan técnicas como la atención escalada de producto punto y la atención de múltiples cabezas para mejorar el rendimiento del modelo. Se destaca que el Transformer es más eficiente en términos de complejidad computacional y longitud máxima de la ruta para aprender dependencias a largo plazo en secuencias. Se menciona que el modelo ha establecido un nuevo estado del arte en tareas de traducción y se planea extender su uso a otras modalidades de entrada y salida."
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "TEMPLATE = \"\"\"\n",
    "Write a concise summary of the information presented. Write the summary in Spanish.\n",
    "\n",
    "{text}\n",
    "\n",
    "SUMMARY:\"\"\"\n",
    "PROMPT = PromptTemplate(template=TEMPLATE, input_variables=[\"text\"])\n",
    "\n",
    "chain = load_summarize_chain(llm, chain_type=\"map_reduce\", map_prompt=PROMPT, combine_prompt=PROMPT)\n",
    "\n",
    "url = \"https://arxiv.org/pdf/1706.03762\"\n",
    "loader = PyPDFLoader(url)\n",
    "docs = loader.load_and_split()\n",
    "summary = chain.invoke(docs)['output_text']\n",
    "display_markdown(summary, raw=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "D3MVbKWNsLL3"
   },
   "source": [
    "## Summarize Multiple PDFs\n",
    "\n",
    "We will now see how to summarize multiple documents into one. We will summarize the following four papers, each of which is very important to the field of GenAI.\n",
    "\n",
    "* \"[Attention Is All You Need](https://arxiv.org/pdf/1706.03762)\" by Ashish Vaswani et al. (2017) - This paper introduced the Transformer architecture, which has become the backbone of most modern natural language processing systems, including text-to-text generative models like GPT and BERT.\n",
    "* \"[BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/pdf/1810.04805)\" by Jacob Devlin et al. (2018) - BERT (Bidirectional Encoder Representations from Transformers) revolutionized the way contextual information is handled by using a bidirectional training of Transformer models. This methodology significantly improved the performance of models on various NLP tasks.\n",
    "* \"[Language Models are Few-Shot Learners](https://arxiv.org/pdf/2005.14165)\" by Tom B. Brown et al. (2020) - Also known as the GPT-3 paper, it explores the capabilities of very large transformer-based models, demonstrating that scaling up the size of the models improves performance across a broad spectrum of NLP tasks, often requiring little to no task-specific data.\n",
    "* \"[Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer](https://arxiv.org/pdf/1910.10683)\" by Colin Raffel et al. (2019) - This paper introduces T5 (Text-to-Text Transfer Transformer), which converts all NLP tasks into a unified text-to-text format, simplifying the application of transfer learning across different tasks.\n",
    "\n",
    "We use the same process demonstrated to load all these PDF documents and concatenate their summaries into an array."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "wIVM9OwssPZm",
    "outputId": "eb0c7a93-68fc-42ff-c5f0-440395757179"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Reading: https://arxiv.org/pdf/1706.03762\n",
      "Reading: https://arxiv.org/pdf/1810.04805\n",
      "Reading: https://arxiv.org/pdf/2005.14165\n",
      "Reading: https://arxiv.org/pdf/1910.10683\n"
     ]
    }
   ],
   "source": [
    "urls = [\n",
    "  \"https://arxiv.org/pdf/1706.03762\",\n",
    "  \"https://arxiv.org/pdf/1810.04805\",\n",
    "  \"https://arxiv.org/pdf/2005.14165\",\n",
    "  \"https://arxiv.org/pdf/1910.10683\"\n",
    "]\n",
    "\n",
    "summaries = []\n",
    "\n",
    "chain = load_summarize_chain(llm, chain_type=\"map_reduce\")\n",
    "\n",
    "for url in urls:\n",
    "  print(f\"Reading: {url}\")\n",
    "  loader = PyPDFLoader(url)\n",
    "  docs = loader.load_and_split()\n",
    "  chain = load_summarize_chain(llm, chain_type=\"map_reduce\")\n",
    "  summary = chain.invoke(docs)['output_text']\n",
    "  summaries.append(summary)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "QE79kzVUTZ15"
   },
   "source": [
    "After obtaining individual summaries of articles, the next step involves combining these summaries into a single, comprehensive overview. The provided code accomplishes this by first merging all the initial summaries into one long string. To manage the potentially large amount of text, it uses the CharacterTextSplitter class from LangChain to split this combined text into manageable chunks. Each chunk maintains a size of 500 characters with an overlap of 100 characters to ensure continuity and context are preserved across chunks. These chunks are then converted into Document objects, each holding a segment of the summarized text. A new summarization chain is loaded using the same 'map_reduce' model to process these document objects. This chain effectively runs across the segmented texts, extracting key information and producing a final, condensed summary of the combined initial summaries. Finally, this ultimate summary is displayed in markdown format to maintain clarity and formatting, providing a clear and succinct synthesis of the original articles' content."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 98
    },
    "id": "cjpShxq87RvJ",
    "outputId": "1baf5d03-fc50-4710-c61a-12baf74450d5"
   },
   "outputs": [
    {
     "data": {
      "text/markdown": [
       "The paper \"Attention Is All You Need\" introduces the Transformer model for sequence transduction tasks, achieving state-of-the-art results in machine translation with less training time. It utilizes self-attention for sequence modeling, eliminating the need for recurrent or convolutional networks. The paper also discusses the introduction and performance of BERT, T5, and GPT-3 models for natural language processing tasks, highlighting their capabilities and potential advancements. Challenges and limitations, such as biases and misuse, are also addressed."
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "from langchain.text_splitter import CharacterTextSplitter\n",
    "from langchain.document_loaders import TextLoader\n",
    "from langchain.schema.document import Document\n",
    "\n",
    "chain = load_summarize_chain(llm, chain_type=\"map_reduce\")\n",
    "\n",
    "summary_str = \" \".join(summaries)\n",
    "text_splitter = CharacterTextSplitter(chunk_size=500, chunk_overlap=100)\n",
    "texts = text_splitter.split_text(summary_str)\n",
    "docs = [Document(page_content=t) for t in texts]\n",
    "chain = load_summarize_chain(llm, chain_type=\"map_reduce\")\n",
    "final_summary = chain.invoke(docs)['output_text']\n",
    "display_markdown(final_summary, raw=True)"
   ]
  }
 ],
 "metadata": {
  "anaconda-cloud": {},
  "colab": {
   "provenance": []
  },
  "kernelspec": {
   "display_name": "Python 3.11 (genai)",
   "language": "python",
   "name": "genai"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.8"
  },
  "varInspector": {
   "cols": {
    "lenName": 16,
    "lenType": 16,
    "lenVar": 40
   },
   "kernels_config": {
    "python": {
     "delete_cmd_postfix": "",
     "delete_cmd_prefix": "del ",
     "library": "var_list.py",
     "varRefreshCmd": "print(var_dic_list())"
    },
    "r": {
     "delete_cmd_postfix": ") ",
     "delete_cmd_prefix": "rm(",
     "library": "var_list.r",
     "varRefreshCmd": "cat(var_dic_list()) "
    }
   },
   "types_to_exclude": [
    "module",
    "function",
    "builtin_function_or_method",
    "instance",
    "_Feature"
   ],
   "window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
